library(dplyr)
library(readr)
library(tibble)
library(tidyr)
library(purrr)
library(broom)
library(pheatmap)
library(plotly)
library(microbiome)
library(knitr)

How does high fat diet impact the microbiome of mice?

Picture of obese mice

Picture of obese mice

# load metadata
metadata <- read_tsv('Example/metadata.txt')
kable(metadata)
X1 Age Air_temperature Diet Food_name Country Provider Sex Strain Body_weight
ERR675518 17 30 chow D12450B Norway Taconic in Denmark male C57/BL6 25.62
ERR675519 17 30 HF D12492 Norway Taconic in Denmark male C57/BL6 30.15
ERR675520 17 30 chow D12450B Norway Taconic in Denmark male C57/BL6 26.61
ERR675521 17 30 HF D12492 Norway Taconic in Denmark male C57/BL6 28.29
ERR675522 17 30 chow D12450B Norway Taconic in Denmark male C57/BL6 26.49
ERR675523 17 30 HF D12492 Norway Taconic in Denmark male C57/BL6 31.42
ERR675524 17 30 chow D12450B Norway Taconic in Denmark male C57/BL6 27.70
ERR675525 17 30 HF D12492 Norway Taconic in Denmark male C57/BL6 29.54
ERR675526 17 30 chow D12450B Norway Taconic in Denmark male C57/BL6 27.48
ERR675527 17 30 chow D12450B Norway Taconic in Denmark male C57/BL6 28.16
ERR675528 17 30 HF D12492 Norway Taconic in Denmark male C57/BL6 30.08
ERR675529 17 30 chow D12450B Norway Taconic in Denmark male C57/BL6 27.44
ERR675530 17 30 HF D12492 Norway Taconic in Denmark male C57/BL6 34.15
ERR675531 17 30 HF D12492 Norway Taconic in Denmark male C57/BL6 30.61
ERR675532 17 30 HF D12492 Norway Taconic in Denmark male C57/BL6 30.84

We confirm that the mice on high fat diet really put more weight on.

ggplot(metadata, aes(x = Diet, y = Body_weight)) +
  geom_point() +
  theme_minimal()

# create a short label for each genome
Tax <- read_tsv('Example/Results/taxonomy.tsv') %>%
  mutate(Labels = ifelse(is.na(species) & is.na(genus), paste0(family, " ", user_genome), species)) %>%
  mutate(Labels = ifelse(is.na(Labels), paste0(genus, " ", user_genome), Labels))

Relative abundance

For the relative abundance we take the coverage over the genome not the raw counts. This inmplicit normalizes for genome size. The coverage is calculated as the median of the coverage values calculated in 1kb blocks.

D <- read_tsv("Example/Results/counts/median_coverage_genomes.tsv") %>%
  column_to_rownames(var = "X1")
# calculate relative abundance
rel_ab <- D/rowSums(D)

Bar chart wich group labels

level <- 'family'

grouped_data <- rel_ab %>%
  t() %>%
  as.data.frame() %>%
  rownames_to_column(var = "user_genome") %>%
  pivot_longer(cols = -user_genome, names_to = "Sample", values_to = "rel_ab") %>%
  left_join(Tax, by = "user_genome") %>%
  left_join(metadata, by = c("Sample" = "X1")) %>%
  group_by(Sample, family, Diet) %>%
  summarise(summarized_rel_ab = sum(rel_ab))

ggplot(grouped_data, aes(x = Sample, y = summarized_rel_ab, fill = family)) +
  geom_col() +
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90), 
        axis.text.y = element_blank(), 
        axis.title.y = element_blank()) +
  facet_wrap(~Diet, scales = "free_x")

We see that the high fat diet induces a big change from the Muribaculaceae family (Bacteroidetes) to the Lachnospiraceae family.

Compositional data analysis

In order to analyze the microbiome at the species or genome level we use compositional data analyis (CoDa), see more on Wikipedia and this article:

Gloor, Gregory B., Jean M. Macklaim, Vera Pawlowsky-Glahn, and Juan J. Egozcue. 2017. “Microbiome Datasets Are Compositional: And This Is Not Optional.” Frontiers in Microbiology 8 (November). Frontiers: 2224. doi: 10.3389/fmicb.2017.02224.

For this we load the rawcounts and use centric log ratios (clr) after imputing values for the zeros.

#load raw counts

Counts <- read_tsv('Example/Results/counts/raw_counts_genomes.tsv') %>%
  column_to_rownames(var = "Sample") %>%
  t()

# transforme counts with centrig log ratio
data <- transform(Counts, transform = "clr")

PCA (PCoA) of the robust aitchison distance

transformed_data <- prcomp(data)
pca_data <- transformed_data$x %>%
  as.data.frame() %>%
  rownames_to_column(var = "Sample") %>%
  left_join(metadata, by = c("Sample" = "X1"))

ggplot(pca_data, aes(x = PC1, y = PC2, color = Diet)) +
  geom_point() +
  theme_minimal() +
  scale_color_manual(values = c(chow = "#00BFC4", HF = "#F8766D"))

Differencial abundance analyis

As the counts are normalized in centered log ratio the log FC becomes the difference.

We use the welch test to assess differential abundance in the two groups. This is a simple version of aldex2. See Gloor et al for more information.

# mean abundance per group
Stats <- data %>%
  as.data.frame() %>%
  rownames_to_column(var = "Sample") %>%
  pivot_longer(cols = -Sample, names_to = "Id", values_to = "clr") %>%
  left_join(metadata, by = c("Sample" = "X1")) %>%
  group_by(Diet, Id) %>%
  summarize(mean_clr = mean(clr)) %>%
  pivot_wider(id_cols = Id, names_from = Diet, values_from = mean_clr)

# calculate logFC
Stats <- Stats %>%
  mutate(logFC = HF - chow)

# format data for t test
data_ttest <- data %>%
  as.data.frame() %>%
  rownames_to_column(var = "Sample") %>%
  pivot_longer(cols = -Sample, names_to = "Id", values_to = "clr") %>%
  left_join(metadata, by = c("Sample" = "X1"))

# run t test
data_ttest <- data_ttest %>%
  group_by(Id, Diet) %>% 
  nest() %>% 
  spread(key = Diet, value = data) %>% 
  mutate(
    t_test = map2(HF, chow, ~{t.test(.x$clr, .y$clr) %>% tidy()}),
    HF = map(HF, nrow),
    chow = map(chow, nrow)
  ) %>% 
  unnest() %>%
  select(Id, p.value) %>%
  mutate(Pvalue = p.value) %>%
  mutate(logP = -log10(Pvalue))

Stats <- Stats %>%
  left_join(data_ttest, by = "Id") %>%
  left_join(Tax, by = c("Id" = "user_genome"))

heatmap of significant Genomes

Correcting form multiple testing would be even better

# filter to MAG abundances that were significantly different
sig_data <- data %>%
  as.data.frame() %>%
  select(Stats[Stats$Pvalue < 0.01, ]$Id) %>%
  t()

# make a dataframe to use to annotate the heatmap
annot_df <- data.frame(Sample = colnames(sig_data)) %>%
  left_join(metadata, by = c("Sample" = "X1")) %>%
  column_to_rownames(var = "Sample") %>%
  select(Diet)

# sort labels by sig_data order
heatmap_labels <- Tax %>%
  filter(user_genome %in% rownames(sig_data)) 
heatmap_labels <- heatmap_labels[order(match(heatmap_labels$user_genome, rownames(sig_data))), ]

pheatmap(sig_data, cluster_rows = T, cluster_cols = T, annotation_col = annot_df,
         labels_row = heatmap_labels$Labels)

Volcano plot

# non interactive plot
ggplot(Stats, aes(x = logFC, y = logP, alpha = logP)) +
  geom_point(color = "#67000d") +
  theme_minimal()

plt <- ggplot(Stats, aes(x = logFC, y = logP, alpha = logP,
                         label = Labels, label2 = HF, label3 = chow)) +
  geom_point(color = "#67000d") +
  theme_minimal()

ggplotly(plt, tooltip = c("label", "label2", "label3"))

The uncultured species with the name ‘UBA7173 sp001689485’ is highly significantly increased in chow mice vs HF mice. It belongs to the Muribaculaceae family.

ggplot(data %>%
         as.data.frame %>%
         rownames_to_column(var = "Sample") %>%
         left_join(metadata, by = c("Sample" = "X1")), 
         aes(y = MAG10, x = Diet, fill = Diet)) +
  geom_boxplot() +
  theme_minimal() +
  scale_fill_manual(values = c(chow = "#00BFC4", HF = "#F8766D"))

kable(Tax %>% 
  filter(user_genome == 'MAG10'))
user_genome kindom phylum class order family genus species Labels
MAG10 Bacteria Bacteroidota Bacteroidia Bacteroidales Muribaculaceae UBA7173 UBA7173 sp001689485 UBA7173 sp001689485